{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KNN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 概览\n",
    "\n",
    "- KNN用于分类和回归，需要考虑最近的邻居\n",
    "- 分类就是分组；回归就是预测结果（数字）\n",
    "- 特征抽取就是将物品的（水果和用户）转化为一系列可比较的数字\n",
    "- 能否挑选合适的特征使馆KNN算法的成败"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 最近邻分类器(1-nearest neighbor classifier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-30T12:05:23.177908Z",
     "start_time": "2018-12-30T12:05:23.067874Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.973684210526\n"
     ]
    }
   ],
   "source": [
    "from scipy.spatial import distance\n",
    "\n",
    "class ScrappyKNN():\n",
    "    def fit(self, X_train, y_train):\n",
    "        self.X_train = X_train\n",
    "        self.y_train = y_train\n",
    "        \n",
    "    def predict(self, X_test):\n",
    "        predictions = []\n",
    "        for row in X_test:\n",
    "            label = self.closest(row)\n",
    "            predictions.append(label)\n",
    "        return predictions\n",
    "    \n",
    "    def closest(self, row):\n",
    "        best_dist = self.euc(row, self.X_train[0])\n",
    "        best_index = 0\n",
    "        for i in range(1, len(X_train)):\n",
    "            dist = self.euc(row, self.X_train[i])\n",
    "            if dist < best_dist:\n",
    "                best_dist = dist\n",
    "                best_index = i\n",
    "        return self.y_train[best_index]\n",
    "    \n",
    "    def euc(self, a, b):\n",
    "        return distance.euclidean(a, b)\n",
    "    \n",
    "# test\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "iris = load_iris()\n",
    "X = iris.data\n",
    "y = iris.target\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
    "\n",
    "my_classifier = ScrappyKNN()\n",
    "my_classifier.fit(X_train,y_train)\n",
    "predictions = my_classifier.predict(X_test)\n",
    "\n",
    "print(accuracy_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### KNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-12-30T12:15:41.742396Z",
     "start_time": "2018-12-30T12:15:41.629990Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.921052631579\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import operator\n",
    "from scipy.spatial import distance\n",
    "\n",
    "class ScrappyKNN():\n",
    "    def fit(self, X_train, y_train, k):\n",
    "        self.X_train = X_train\n",
    "        self.y_train = y_train\n",
    "        self.k = k\n",
    "        \n",
    "    def predict(self, X_test):\n",
    "        predictions = []\n",
    "        for row in X_test:\n",
    "            label = self.closest_k(row)\n",
    "            predictions.append(label)\n",
    "        return predictions\n",
    "    \n",
    "    def closest_k(self, row):\n",
    "        # distances存储测试点到数据集各个点的距离\n",
    "        distances = []\n",
    "        for i in range(len(X_train)):\n",
    "            distances.append(self.euc(row, self.X_train[i]))\n",
    "        # 转换成数组，对距离排序（从小到大），返回位置信息\n",
    "        distances = np.array(distances)\n",
    "        sortedDistIndicies = distances.argsort()\n",
    "        \n",
    "        classCount = {}\n",
    "        for i in range(self.k):\n",
    "            voteLabel = y_train[sortedDistIndicies[i]]\n",
    "            classCount[voteLabel] = classCount.get(voteLabel, 0) + 1\n",
    "        # 根据“票数”排序\n",
    "        sortedClassCount = sorted(classCount.items(), \n",
    "                                  key=operator.itemgetter(1),\n",
    "                                 reverse=True)\n",
    "        return sortedClassCount[0][0]\n",
    "    \n",
    "    def euc(self, a, b):\n",
    "        return distance.euclidean(a, b)\n",
    "    \n",
    "# test\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "iris = load_iris()\n",
    "X = iris.data\n",
    "y = iris.target\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
    "\n",
    "my_classifier = ScrappyKNN()\n",
    "my_classifier.fit(X_train,y_train, k=3)\n",
    "predictions = my_classifier.predict(X_test)\n",
    "\n",
    "print(accuracy_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": false,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
